{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 白人和黑人在求职路上会有种族的歧视吗？"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy import stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.io.stata.read_stata('us_job_market_discrimination.dta')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>ad</th>\n",
       "      <th>education</th>\n",
       "      <th>ofjobs</th>\n",
       "      <th>yearsexp</th>\n",
       "      <th>honors</th>\n",
       "      <th>volunteer</th>\n",
       "      <th>military</th>\n",
       "      <th>empholes</th>\n",
       "      <th>occupspecific</th>\n",
       "      <th>...</th>\n",
       "      <th>compreq</th>\n",
       "      <th>orgreq</th>\n",
       "      <th>manuf</th>\n",
       "      <th>transcom</th>\n",
       "      <th>bankreal</th>\n",
       "      <th>trade</th>\n",
       "      <th>busservice</th>\n",
       "      <th>othservice</th>\n",
       "      <th>missind</th>\n",
       "      <th>ownership</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>316</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>19</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>313</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>22</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>313</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Nonprofit</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 65 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  id ad  education  ofjobs  yearsexp  honors  volunteer  military  empholes  \\\n",
       "0  b  1          4       2         6       0          0         0         1   \n",
       "1  b  1          3       3         6       0          1         1         0   \n",
       "2  b  1          4       1         6       0          0         0         0   \n",
       "3  b  1          3       4         6       0          1         0         1   \n",
       "4  b  1          3       3        22       0          0         0         0   \n",
       "\n",
       "   occupspecific    ...      compreq  orgreq  manuf  transcom  bankreal trade  \\\n",
       "0             17    ...          1.0     0.0    1.0       0.0       0.0   0.0   \n",
       "1            316    ...          1.0     0.0    1.0       0.0       0.0   0.0   \n",
       "2             19    ...          1.0     0.0    1.0       0.0       0.0   0.0   \n",
       "3            313    ...          1.0     0.0    1.0       0.0       0.0   0.0   \n",
       "4            313    ...          1.0     1.0    0.0       0.0       0.0   0.0   \n",
       "\n",
       "  busservice othservice  missind  ownership  \n",
       "0        0.0        0.0      0.0             \n",
       "1        0.0        0.0      0.0             \n",
       "2        0.0        0.0      0.0             \n",
       "3        0.0        0.0      0.0             \n",
       "4        0.0        1.0      0.0  Nonprofit  \n",
       "\n",
       "[5 rows x 65 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "blacks = data[data.race == 'b']\n",
    "whites = data[data.race == 'w']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    2435.000000\n",
       "mean        0.064476\n",
       "std         0.245649\n",
       "min         0.000000\n",
       "25%         0.000000\n",
       "50%         0.000000\n",
       "75%         0.000000\n",
       "max         1.000000\n",
       "Name: call, dtype: float64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "blacks.call.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    2435.000000\n",
       "mean        0.096509\n",
       "std         0.295346\n",
       "min         0.000000\n",
       "25%         0.000000\n",
       "50%         0.000000\n",
       "75%         0.000000\n",
       "max         1.000000\n",
       "Name: call, dtype: float64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "whites.call.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 卡方检验\n",
    "\n",
    "* 白人获得职位\n",
    "* 白人被拒绝\n",
    "* 黑人获得职位\n",
    "* 黑人被拒绝\n",
    "\n",
    "## 假设检验\n",
    "- H0：种族对求职结果没有显著影响\n",
    "- H1：种族对求职结果有影响"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "blacks_called = len(blacks[blacks['call'] == True])\n",
    "blacks_not_called = len(blacks[blacks['call'] == False])\n",
    "whites_called = len(whites[whites['call'] == True])\n",
    "whites_not_called = len(whites[whites['call'] == False])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "observed = pd.DataFrame({'blacks': {'called': blacks_called, 'not_called': blacks_not_called},\n",
    "                         'whites': {'called' : whites_called, 'not_called' : whites_not_called}})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>blacks</th>\n",
       "      <th>whites</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>called</th>\n",
       "      <td>157</td>\n",
       "      <td>235</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>not_called</th>\n",
       "      <td>2278</td>\n",
       "      <td>2200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            blacks  whites\n",
       "called         157     235\n",
       "not_called    2278    2200"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "observed\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "392\n",
      "4478\n"
     ]
    }
   ],
   "source": [
    "num_called_back = blacks_called + whites_called\n",
    "num_not_called = blacks_not_called + whites_not_called\n",
    "\n",
    "print(num_called_back)\n",
    "print(num_not_called)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rate_of_callbacks = num_called_back / (num_not_called + num_called_back)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "得到期望的比率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.08049281314168377"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rate_of_callbacks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "expected_called = len(data)  * rate_of_callbacks\n",
    "expected_not_called = len(data)  * (1 - rate_of_callbacks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "391.99999999999994\n",
      "4478.0\n"
     ]
    }
   ],
   "source": [
    "print(expected_called)\n",
    "print(expected_not_called)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Power_divergenceResult(statistic=16.879050414270221, pvalue=0.00074839594410972638)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import scipy.stats as stats\n",
    "observed_frequencies = [blacks_not_called, whites_not_called, whites_called, blacks_called]\n",
    "expected_frequencies = [expected_not_called/2, expected_not_called/2, expected_called/2, expected_called/2]\n",
    "\n",
    "\n",
    "stats.chisquare(f_obs = observed_frequencies,\n",
    "                f_exp = expected_frequencies)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "看起来种族歧视是存在的！"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
